# $Id: GenBank.pm,v 1.9 2003/09/09 21:29:00 lstein Exp $
#
# BioPerl module for Bio::DB::Query::GenBank.pm
#
# Cared for by Lincoln Stein <lstein@cshl.org>
#
# Copyright Lincoln Stein
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
#

=head1 NAME

Bio::DB::Query::GenBank - Build a GenBank Entrez Query

=head1 SYNOPSIS

   my $query_string = 'Oryza[Organism] AND EST[Keyword]';
   my $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide',
                                            -query=>$query_string,
					    -mindate => '2001',
					    -maxdate => '2002');
   my $count = $query->count;
   my @ids   = $query->ids;

   # get a genbank database handle
   my $gb = new Bio::DB::GenBank;
   my $stream = $gb->get_Stream_by_query($query);
   while (my $seq = $stream->next_seq) {
      # do something with the sequence object
   }

   # initialize the list yourself
   my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]);


=head1 DESCRIPTION

This class encapsulates NCBI Entrez queries.  It can be used to store
a list of GI numbers, to translate an Entrez query expression into a
list of GI numbers, or to count the number of entries that would be
returned by a query.  Once created, the query object can be passed to
a Bio::DB::GenBank object in order to retrieve the entries
corresponding to the query.

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the
evolution of this and other Bioperl modules. Send
your comments and suggestions preferably to one
of the Bioperl mailing lists. Your participation
is much appreciated.

  bioperl-l@bioperl.org              - General discussion
  http://bioperl.org/MailList.shtml  - About the mailing lists

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to
help us keep track of the bugs and their resolution.
Bug reports can be submitted via email or the
web:

  bioperl-bugs@bio.perl.org
  http://bugzilla.bioperl.org/

=head1 AUTHOR - Lincoln Stein

Email lstein@cshl.org

=head1 APPENDIX

The rest of the documentation details each of the
object methods. Internal methods are usually
preceded with a _

=cut

# Let the code begin...

package Bio::DB::Query::GenBank;
use strict;
use Bio::DB::Query::WebQuery;
use URI::Escape 'uri_unescape';

# NCBI E-utilities endpoints.  NCBI serves the E-utilities over HTTPS from
# the eutils.ncbi.nlm.nih.gov host; plain-http requests to the legacy
# www.ncbi.nih.gov host are no longer accepted.
# EPOST is kept for compatibility; it is not referenced in this part of the file.
use constant EPOST               => 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi';
use constant ESEARCH             => 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi';
# Database searched when the caller does not pass -db to new().
use constant DEFAULT_DB          => 'protein';
# Fallback 'retmax' sent with a search while no result count is cached.
use constant MAXENTRY            => 100;

# @ATTRIBUTES is filled in by the BEGIN block that generates the accessors.
use vars qw(@ISA @ATTRIBUTES);

@ISA     = 'Bio::DB::Query::WebQuery';

# Generate one get/set accessor per Entrez search attribute at compile time.
# Each accessor stores its value in $self->{"_<name>"}; when called with an
# argument it stores the new value and returns the PREVIOUS one, otherwise it
# returns the current value.
BEGIN {
  @ATTRIBUTES = qw(db reldate mindate maxdate datetype);
  for my $method (@ATTRIBUTES) {
    my $slot     = "_$method";
    my $accessor = sub {
      my $self     = shift;
      my $previous = $self->{$slot};
      $self->{$slot} = shift if @_;
      return $previous;
    };

    # Installing a sub under a computed name requires a symbolic glob
    # reference; strict refs is relaxed only for this one assignment.
    no strict 'refs';
    *{__PACKAGE__ . "::$method"} = $accessor;
  }
}

=head2 new

 Title   : new
 Usage   : $db = Bio::DB::Query::GenBank->new(@args)
 Function: create new query object
 Returns : new query object
 Args    : -db       database ('protein' or 'nucleotide')
           -query    query string
           -mindate  minimum date to retrieve from
           -maxdate  maximum date to retrieve from
           -reldate  relative date to retrieve from (days)
           -datetype date field to use ('edat' or 'mdat')
           -ids      array ref of gids (overrides query)

This method creates a new query object.  Typically you will specify a
-db and a -query argument, possibly modified by -mindate, -maxdate, or
-reldate.  -mindate and -maxdate specify minimum and maximum dates for
entries you are interested in retrieving, expressed in the form
YYYY/MM/DD (YYYY and YYYY/MM are also accepted).  -reldate is used to
fetch entries that are more recent than the indicated number of days.

If you provide an array reference of IDs in -ids, the query will be
ignored and the list of IDs will be used when the query is passed to a
Bio::DB::GenBank object's get_Stream_by_query() method.  A variety of
IDs are automatically recognized, including GI numbers, Accession
numbers, Accession.version numbers and locus names.

=cut

# Constructor: builds the object through the parent class, then records the
# Entrez-specific search attributes taken from the named arguments.
sub new {
  my $class = shift;
  my $self  = $class->SUPER::new(@_);

  # QUERY and IDS stay in the key list so the positions of the other values
  # are unchanged; their values are not used in this method (presumably the
  # parent constructor consumes them -- confirm against WebQuery).
  my (undef,$db,$reldate,$mindate,$maxdate,$datetype)
    = $self->_rearrange([qw(QUERY DB RELDATE MINDATE MAXDATE DATETYPE IDS)],@_);

  $self->db($db || DEFAULT_DB);

  # The date restrictions are optional and only stored when supplied.
  $self->reldate($reldate) if $reldate;
  $self->mindate($mindate) if $mindate;
  $self->maxdate($maxdate) if $maxdate;

  # The date type always gets a value, falling back to 'mdat'.
  $self->datetype($datetype || 'mdat');

  return $self;
}

=head2 cookie

 Title   : cookie
 Usage   : ($cookie,$querynum) = $db->cookie
 Function: return the NCBI query cookie
 Returns : list of (cookie,querynum)
 Args    : none

NOTE: this information is used by Bio::DB::GenBank in
conjunction with efetch.

=cut

# Get or set the (cookie, querynum) pair.
# With arguments: stores the first as the cookie and the second as the query
# number, without running the query.
# Without arguments: runs the query via _run_query, then returns the stored
# pair (the query number alone in scalar context).
sub cookie {
  my ($self, @pair) = @_;

  unless (@pair) {
    $self->_run_query;
    return @{$self}{qw(_cookie _querynum)};
  }

  $self->{'_cookie'} = $pair[0];
  return $self->{'_querynum'} = $pair[1];
}

=head2 _request_parameters

 Title   : _request_parameters
 Usage   : ($method,$base,@params) = $db->_request_parameters
 Function: return information needed to construct the request
 Returns : list of method, url base and key=>value pairs
 Args    : none

=cut

# Assemble the pieces of the esearch request.
# Returns a list: the HTTP method ('get'), the ESEARCH base URL, then the
# key => value request parameters.
sub _request_parameters {
  my $self = shift;

  # Include only the search attributes that currently hold a true value.
  # Each accessor is called once, directly by name, rather than through a
  # string eval that would also hide any error raised by the call.
  my @params;
  for my $attribute (@ATTRIBUTES) {
    my $value = $self->$attribute();
    push @params, ($attribute => $value) if $value;
  }

  push @params, ('usehistory' => 'y', 'tool' => 'bioperl');
  push @params, ('term'   => $self->query);
  # Ask for as many ids as the cached count, or MAXENTRY when none is cached.
  push @params, ('retmax' => $self->{'_count'} || MAXENTRY);

  return ('get', ESEARCH, @params);
}


=head2 count

 Title   : count
 Usage   : $count = $db->count;
 Function: return count of number of entries retrieved by query
 Returns : integer
 Args    : none

Returns the number of entries that are matched by the query.

=cut

# Get or set the number of entries matched by the query.
# With an argument: stores it as the new count and returns the previous one,
# without running the query.
# Without arguments: runs the query via _run_query and returns the count.
sub count {
  my ($self, @value) = @_;

  unless (@value) {
    $self->_run_query;
    return $self->{'_count'};
  }

  my $previous = $self->{'_count'};
  $self->{'_count'} = $value[0];
  return $previous;
}

=head2 ids

 Title   : ids
 Usage   : @ids = $db->ids([@ids])
 Function: get/set matching ids
 Returns : array of sequence ids
 Args    : (optional) array ref with new set of ids

=cut

=head2 query

 Title   : query
 Usage   : $query = $db->query([$query])
 Function: get/set query string
 Returns : string
 Args    : (optional) new query string

=cut

=head2 _parse_response

 Title   : _parse_response
 Usage   : $db->_parse_response($content)
 Function: parse out response
 Returns : empty
 Args    : $content - the response text to parse
 Throws  : 'unparseable output exception'

=cut

# Parse the text of an esearch response and record the results on the object:
# the hit count, the id list (only when the response is not truncated), the
# truncation flag, and the (WebEnv, QueryKey) pair via cookie().
# Warns on an <ErrorList>; throws on an <OutputMessage> and on a response
# that carries no <Count> element (unparseable output).
sub _parse_response {
  my $self    = shift;
  my $content = shift;
  $content = '' unless defined $content;

  if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
    warn "Warning(s) from GenBank: $warning\n";
  }
  if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
    $self->throw("Error from Genbank: $error");
  }

  # Without a <Count> there is nothing meaningful to record; fail loudly
  # instead of storing an undefined count.
  my ($count) = $content =~ /<Count>(\d+)/;
  defined $count
    or $self->throw("Unparseable output from GenBank: no <Count> element found");

  # A missing <RetMax> is treated as 0, i.e. any hit counts as truncated.
  my ($max) = $content =~ /<RetMax>(\d+)/;
  $max = 0 unless defined $max;

  my $truncated = $count > $max;
  $self->count($count);
  if (!$truncated) {
    my @ids = $content =~ /<Id>(\d+)/g;
    $self->ids(\@ids);
  }
  $self->_truncated($truncated);

  my ($cookie)   = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
  my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
  # Only unescape a cookie that was actually present in the response.
  $cookie = uri_unescape($cookie) if defined $cookie;
  return $self->cookie($cookie,$querykey);
}

1;
